Using the data collected from existing customers, build a model that will help the marketing team identify potential customers who are relatively more likely to subscribe to a term deposit, and thus increase their hit ratio.
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import confusion_matrix, recall_score, precision_score, f1_score, roc_auc_score,accuracy_score,roc_curve
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing
from scipy import stats
from sklearn.tree import export_graphviz
from sklearn.externals.six import StringIO
from IPython.display import Image
import pydotplus
import graphviz
import warnings
warnings.filterwarnings('ignore')
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity='all'
# --- Load the data ---
# Read the CSV file of the bank's past tele-marketing campaign records
df=pd.read_csv('bank-full.csv')
# --- First-pass inspection of the raw frame ---
df.info() # Info about dataframe, columns, data types, Null values
print('Shape of the dataframe is:',df.shape) # Shape of data frame
print('-----First five rows are-----')
df.head() # First five rows
print('-----Last five rows are-----')
df.tail() # Last five rows
print('-----Number of unique values-----')
df.nunique() # Number of unique values in each column
print('-----Number of Nulls-----')
df.isnull().sum() # Check Null values
df.describe().transpose() # Statistical summary of the numeric columns
# Convert the string columns with a fixed set of values to the
# memory-efficient pandas 'category' dtype (one loop instead of ten
# copy-pasted astype calls).
categorical_cols = ['job', 'marital', 'education', 'default', 'housing',
                    'loan', 'contact', 'month', 'poutcome', 'Target']
for col in categorical_cols:
    df[col] = df[col].astype('category')
# Check datatypes after the conversion
df.dtypes
# --- Univariate analysis: age ---
plt.figure(figsize=(14,5))
sns.distplot(df['age']);
df['age'].unique() # Check unique values
df['age'].isnull().sum() # Check null
df['age'].describe() # Check mean, min, max, std, quartiles
df['age'].median() # Median value
# Compute the IQR fences from the data instead of hard-coding Q1=33, Q3=48
# (same values on this dataset, but this stays correct if the data changes).
q1, q3 = df['age'].quantile([0.25, 0.75])
iqr = q3 - q1
outliers_lower = q1 - 1.5 * iqr
outliers_upper = q3 + 1.5 * iqr
print(outliers_lower)
print(outliers_upper)
# Number of outliers outside the fences
print('Number of outliers lower:',df[df['age']<outliers_lower]['age'].count()) # Lower
print('Number of outliers upper:',df[df['age']>outliers_upper]['age'].count()) # Upper
plt.figure(figsize=(3,5))
sns.boxplot(x='age',data=df, orient='v');
# --- Univariate analysis: job ---
plt.figure(figsize=(20,10))
sns.countplot(df['job'],data=df)
plt.xlabel('Job', fontsize=15)
plt.ylabel('Count', fontsize=15)
plt.show();
df['job'].unique() # Check unique values
df['job'].isnull().any() # Check null
df['job'].value_counts() # value counts
df['job'].value_counts(normalize=True) # value counts %age
# --- Univariate analysis: marital status ---
plt.figure(figsize=(5,5))
sns.countplot(df['marital'],data=df)
plt.xlabel('Marital Status', fontsize=15)
plt.ylabel('Count', fontsize=15)
plt.show();
df['marital'].unique() # Check unique values
df['marital'].isnull().any() # Check null
df['marital'].value_counts() # value counts
df['marital'].value_counts(normalize=True) # value counts %age
# --- Univariate analysis: education ---
plt.figure(figsize=(5,5))
sns.countplot(df['education'],data=df)
plt.xlabel('Education', fontsize=15)
plt.ylabel('Count', fontsize=15)
plt.show();
df['education'].unique() # Check unique values
df['education'].isnull().any() # Check null
df['education'].value_counts() # value counts
df['education'].value_counts(normalize=True) # value counts %age
# --- Univariate analysis: credit in default ---
plt.figure(figsize=(5,5))
sns.countplot(df['default'],data=df)
plt.xlabel('Credit in Default?', fontsize=15)
plt.ylabel('Count', fontsize=15)
plt.show();
df['default'].unique() # Check unique values
df['default'].isnull().any() # Check null
df['default'].value_counts() # value counts
df['default'].value_counts(normalize=True) # value counts %age
# --- Univariate analysis: housing loan ---
plt.figure(figsize=(5,5))
sns.countplot(df['housing'],data=df)
plt.xlabel('Housing Loan?', fontsize=15)
plt.ylabel('Count', fontsize=15)
plt.show();
df['housing'].unique() # Check unique values
df['housing'].isnull().any() # Check null
df['housing'].value_counts() # value counts
df['housing'].value_counts(normalize=True) # value counts %age
# --- Univariate analysis: personal loan ---
plt.figure(figsize=(5,5))
sns.countplot(df['loan'],data=df)
plt.xlabel('Personal Loan?', fontsize=15)
plt.ylabel('Count', fontsize=15)
plt.show();
df['loan'].unique() # Check unique values
df['loan'].isnull().any() # Check null
df['loan'].value_counts() # value counts
df['loan'].value_counts(normalize=True) # value counts %age
# --- Univariate analysis: balance (average yearly balance) ---
plt.figure(figsize=(14,5))
sns.distplot(df['balance']);
df['balance'].isnull().any() # Check null
df['balance'].describe() # Check mean, min, max, std, quartiles
df['balance'].median() # Median value
# Compute the IQR fences from the data instead of hard-coding Q1=72, Q3=1428
# (same values on this dataset, but this stays correct if the data changes).
q1, q3 = df['balance'].quantile([0.25, 0.75])
iqr = q3 - q1
outliers_lower = q1 - 1.5 * iqr
outliers_upper = q3 + 1.5 * iqr
print(outliers_lower)
print(outliers_upper)
# Number of outliers outside the fences
print('Number of outliers lower:',df[df['balance']<outliers_lower]['balance'].count()) # Lower
print('Number of outliers upper:',df[df['balance']>outliers_upper]['balance'].count()) # Upper
plt.figure(figsize=(7,7))
sns.boxplot(x='balance',data=df, orient='v');
# --- Univariate analysis: contact type (how the customer was contacted) ---
plt.figure(figsize=(5,5))
sns.countplot(df['contact'],data=df)
plt.xlabel('Contact Type', fontsize=15)
plt.ylabel('Count', fontsize=15)
plt.show();
df['contact'].unique() # Check unique values
df['contact'].isnull().any() # Check null
df['contact'].value_counts() # value counts
df['contact'].value_counts(normalize=True) # value counts %age
# --- Univariate analysis: day of month of last contact ---
df['day'].unique() # Check unique values
df['day'].isnull().any() # Check null
df['day'].value_counts() # value counts
df['day'].value_counts(normalize=True) # value counts %age
# --- Univariate analysis: month of last contact ---
df['month'].unique() # Check unique values
df['month'].isnull().any() # Check null
df['month'].value_counts() # value counts
df['month'].value_counts(normalize=True) # value counts %age
# --- Univariate analysis: campaign (number of contacts during this campaign) ---
plt.figure(figsize=(14,5))
sns.distplot(df['campaign']);
df['campaign'].isnull().any() # Check null
df['campaign'].describe() # Check mean, min, max, std, quartiles
df['campaign'].median() # Median value
# IQR fences computed from the campaign quartiles (was hard-coded Q1=1, Q3=3)
q1, q3 = df['campaign'].quantile([0.25, 0.75])
iqr = q3 - q1
outliers_lower = q1 - 1.5 * iqr
outliers_upper = q3 + 1.5 * iqr
print(outliers_lower)
print(outliers_upper)
# Number of outliers — BUG FIX: the original counted outliers on 'balance'
# here (a copy-paste from the balance section); this section analyses
# 'campaign', so the counts must come from the campaign column.
print('Number of outliers lower:',df[df['campaign']<outliers_lower]['campaign'].count()) # Lower
print('Number of outliers upper:',df[df['campaign']>outliers_upper]['campaign'].count()) # Upper
# --- Univariate analysis: pdays (days since last contact; -1 = never contacted) ---
df['pdays'].isnull().any() # Check null
df['pdays'].describe() # Check mean, min, max, std, quartiles
df['pdays'].median() # Median value
print('pdays is -1: {}'.format(df[df.pdays==-1].shape[0]))
We see that for the majority of records the pdays value is -1, which means either the customer has never been contacted or it has been more than 900 days since the last contact.
# --- Univariate analysis: previous (contacts before this campaign) ---
df['previous'].isnull().any() # Check null
df['previous'].describe() # Check mean, min, max, std, quartiles
df['previous'].median() # Median value
# --- Univariate analysis: poutcome (outcome of the previous campaign) ---
plt.figure(figsize=(5,5))
sns.countplot(df['poutcome'],data=df)
plt.xlabel('Previous Campaign Outcome', fontsize=15)
plt.ylabel('Count', fontsize=15)
plt.show();
df['poutcome'].unique() # Check unique values
df['poutcome'].isnull().any() # Check null
df['poutcome'].value_counts() # value counts
df['poutcome'].value_counts(normalize=True) # value counts %age
As we see from above, the success rate of the previous campaign was around 3%, although for the majority of customers the outcome is unknown (81.74%).
# --- Univariate analysis: Target (did the client subscribe a term deposit?) ---
plt.figure(figsize=(5,5))
sns.countplot(df['Target'],data=df)
plt.xlabel('Did Client Subscribed Term Deposit', fontsize=15)
plt.ylabel('Count', fontsize=15)
plt.show();
df['Target'].unique() # Check unique values
df['Target'].isnull().any() # Check null
df['Target'].value_counts() # value counts
df['Target'].value_counts(normalize=True) # value counts %age — note the classes are heavily imbalanced
# So far we have seen unknown/missing data for job and education which can impact our target variable
# Let's check job data again
df['job'].value_counts()
# From above we see for 288 customers we do not have their job information
# Let's check the number of customers who are above 60 with an unknown job.
# FIX: the original used chained indexing df['job'][df['age']>60][df['job']=='unknown'],
# which filters twice through index alignment and triggers pandas indexing
# warnings; a single combined boolean mask with .loc is the supported form.
df.loc[(df['age']>60) & (df['job']=='unknown'), 'job'].value_counts()
# As we see from above 22 customers have no job information and are above the age of 60
# hence, we can move them to retired category
df.loc[(df['age']>60) & (df['job']=='unknown'), 'job'] = 'retired'
# Let's compare job with education to infer if there is a correlation and we can identify/infer unknown values
pd.crosstab(df['job'], df['education'])
# Inference from the job/education crosstab above:
# --> customers with only primary education mostly hold blue-collar jobs
# --> customers with tertiary education mostly hold management jobs
# --> secondary education is spread across technician, blue-collar and
#     admin. jobs, so no inference is possible there
# Impute unknown jobs from education where the relationship is strong
# (dict-driven loop instead of repeated .loc lines).
job_from_education = {'primary': 'blue-collar', 'tertiary': 'management'}
for edu, job in job_from_education.items():
    df.loc[(df['education']==edu) & (df['job']=='unknown'), 'job'] = job
pd.crosstab(df['education'], df['job'])
# Conversely, each job has a dominant education level we can impute from:
# --> admin., services, technician -> secondary
# --> management -> tertiary
# --> housemaid -> primary
education_from_job = {'admin.': 'secondary', 'management': 'tertiary',
                      'services': 'secondary', 'technician': 'secondary',
                      'housemaid': 'primary'}
for job, edu in education_from_job.items():
    df.loc[(df['job']==job) & (df['education']=='unknown'), 'education'] = edu
# Club sparse job classes into broader ones to reduce cardinality:
# admin. -> management, housemaid -> blue-collar, self-employed -> entrepreneur
mapping = {'admin.':'management', 'housemaid':'blue-collar', 'self-employed':'entrepreneur'}
df['job'] = df['job'].replace(mapping)
df['job'].value_counts()
df['job'].value_counts(normalize=True) # value counts %age
df['job']=df['job'].astype('category')  # replace() drops the category dtype; restore it
# Earlier we saw that pdays had -1 for a huge number of records which means
# either the customer has never been contacted or it is more than 900 days
pd.crosstab(df['pdays'],df['poutcome'])
# Bucket pdays into three indicator columns (never contacted / within 100
# days / more than 100 days) and drop the raw value.
# FIX: the original used chained assignment (df['col'][mask] = 1), which
# raises SettingWithCopyWarning and is not guaranteed to write through;
# building each column directly from the boolean mask is exact and safe.
df['pdays_zero'] = (df['pdays']==-1).astype(int)
df['pdays_less_100'] = ((df['pdays']>-1) & (df['pdays']<=100)).astype(int)
df['pdays_great_100'] = (df['pdays']>100).astype(int)
df = df.drop('pdays', axis=1)
df.head()
From our analysis above we saw outliers for age, balance and campaign. For age and campaign, although there were outliers, there was not much variance between the mean and median values and they appear to be genuine real-world values, hence we will not treat them.
However, for balance there is a big difference between the mean and median values, hence we will treat the outliers for balance.
From our analysis above, the IQR fences for balance were -1962 (lower) and 3462 (upper).
As we saw, the number of upper-range outliers is more than 10% of the overall data available, hence we need to transform this variable for a better prediction model.
# Let's look at the balance distribution again before treating outliers
plt.figure(figsize=(14,5))
sns.distplot(df['balance']);
As we see negative balances in the dataset, we can safely mark them as zero balance, since only customers with a higher balance are likely to opt for term deposits; hence we convert negative balances to zero. Also, the balance data is right-skewed, and our hypothesis is that customers with higher balances will opt for term deposits, so we cap all balances above the upper outlier boundary at that boundary.
# Clip negative balances to zero, and cap at 3462 — the IQR upper fence
# computed earlier for balance (1428 + 1.5 * (1428 - 72) = 3462).
df.loc[df.balance<0,'balance'] = 0
df.loc[df.balance>3462,'balance'] = 3462
# Correlation heatmap of the numeric columns
plt.figure(figsize=(10,8))
sns.heatmap(df.corr(),
annot=True,
linewidths=.5,
center=0,
cbar=False,
cmap="YlGnBu")
plt.show();
# Pairwise scatter plots with KDE on the diagonal
plt.figure(figsize=(20,5))
# FIX: the KDE bandwidth must be a number (or the strings 'scott'/'silverman');
# the original passed the string '1.0', which scipy's gaussian_kde rejects.
sns.pairplot(df, diag_kind='kde', diag_kws={'bw': 1.0})
plt.show();
# --- Bivariate analysis: Target vs the numeric variables ---
# Target vs Age
sns.boxplot(x='Target', y='age', data=df)
plt.show();
# Target vs Age
sns.barplot(x='Target', y='age', data=df)
plt.show();
# Target vs balance
sns.boxplot(x='Target', y='balance', data=df)
plt.show();
# Target vs balance
sns.barplot(x='Target', y='balance', data=df)
plt.show();
# Target vs duration of the last call
sns.barplot(x='Target', y='duration', data=df)
plt.show();
sns.boxplot(x='Target', y='duration', data=df)
plt.show();
# As we can see from the plot above where duration of the call is longer more customers subscribed
# which is understandable as the subscription process would have taken more time
# it doesn't really reflect whether a longer duration impacted the decision of customer
# it seems the other way where a customer made a positive decision and hence a longer call duration
# dropping 'duration' as its value will only be known after the call and so cannot be used for prediction
df=df.drop(['duration'],axis=1)
# Target vs Job: row-normalised crosstab gives the conversion rate per job
pd.crosstab(df['job'], df['Target'], normalize='index')
plt.figure(figsize=(20,5))
sns.countplot(x='job',hue='Target', data=df);
As we see from above highest percentages for customers accepting term deposit are (conversion rate):
Overall people with job as management have more term deposits followed by technician and blue-collar jobs
# Target vs Marital: row-normalised crosstab gives the conversion rate per status
pd.crosstab(df['marital'], df['Target'], normalize='index')
sns.countplot(x='marital',hue='Target', data=df);
As we see from above highest percentages for customers accepting term deposit are (conversion rate):
Overall married customers have more term deposits followed by single customers
# Target vs Education: row-normalised crosstab gives the conversion rate per level
pd.crosstab(df['education'], df['Target'], normalize='index')
sns.countplot(x='education',hue='Target', data=df);
As we see from above highest percentages for customers accepting term deposit are (conversion rate):
Overall customers with secondary education have more term deposits followed by tertiary education
# Target vs Default: row-normalised crosstab gives the conversion rate per group
pd.crosstab(df['default'], df['Target'], normalize='index')
sns.countplot(x='default',hue='Target', data=df);
As we see from above customers who do not default on credit have a higher %age of accepting term deposit
# Target vs Housing: row-normalised crosstab gives the conversion rate per group
pd.crosstab(df['housing'], df['Target'], normalize='index')
sns.countplot(x='housing',hue='Target', data=df);
As we see from above customers who do not have home loan had higher %age accepting term deposits
# Target vs Loan: row-normalised crosstab gives the conversion rate per group
pd.crosstab(df['loan'], df['Target'], normalize='index')
sns.countplot(x='loan',hue='Target', data=df);
As we see from above customers who do not have personal loan had higher %age accepting term deposit
# Target vs Contact: raw counts this time, to see the 'unknown' volume
pd.crosstab(df['contact'], df['Target'])
sns.countplot(x='contact',hue='Target', data=df);
We see from above that there are 13020 records where contact is unknown. Of those, 530 have the Target variable as 'yes'. We also see that for the majority of records where the Target variable is 'yes', the contact type was cellular. Hence, we can safely move unknown contacts where Target is 'yes' to 'cellular'.
# Move unknown contacts with a 'yes' target to 'cellular' (see analysis above)
df.loc[(df['contact']=='unknown') & (df['Target']=='yes'), 'contact'] = 'cellular'
# Label-encode every categorical column in a single loop instead of ten
# copy-pasted fit_transform calls (each column is encoded independently).
le = preprocessing.LabelEncoder()
for col in ['job', 'marital', 'education', 'default', 'housing',
            'loan', 'contact', 'month', 'poutcome', 'Target']:
    df[col] = le.fit_transform(df[col])
df.describe().T
df['Target'].value_counts(normalize=True)  # class balance after encoding (1 = subscribed)
# Naive oversampling of the minority class ('yes' subscribers): append the
# positive rows 7 extra times, so positives end up at 8x their original
# count (one concat instead of seven identical statements).
# NOTE(review): this happens BEFORE the train/test split, so duplicated
# positive rows will appear in both train and test; the test metrics below
# are therefore optimistic — confirm whether this was intended.
df_copy = df.copy()
df_minority = df_copy[df.Target==1]
df_copy = pd.concat([df_copy] + [df_minority] * 7)
df = df_copy
df['Target'].value_counts(normalize=True)  # class balance after oversampling
# --- Feature/target split, train/test split, and standardisation ---
x=df.drop('Target',axis=1) # Independent variables
y=df['Target'] # Dependent variable
features = [col for col in df.columns if col != 'Target'] # feature names, reused later for the tree plots
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.30,random_state=1) # Split data in 70:30 ratio
# Standardise to zero mean / unit variance; the scaler is fit on the
# training set only so no test-set information leaks into the transform
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)
# --- Model 1: Logistic Regression (baseline) ---
logreg = LogisticRegression(random_state=1)
logreg.fit(x_train, y_train) # Fit training dataset
print("Train: %.2f" % logreg.score(x_train, y_train)) # performance on train data
print("Test: %.2f" % logreg.score(x_test, y_test)) # performance on test data
y_predict = logreg.predict(x_test) # Get predicted labels for the test set
# Confusion matrix
pd.crosstab(y_test, y_predict, rownames=['Actual'], colnames=['Predicted'])
# Check different metrics on the test set
print('Confusion Matrix')
print(confusion_matrix(y_test,y_predict))
print()
var_recall=recall_score(y_test,y_predict)
print("Recall:",var_recall)
var_precision=precision_score(y_test,y_predict)
print("Precision:",var_precision)
var_f1=f1_score(y_test,y_predict)
print("F1 Score:",var_f1)
var_roc=roc_auc_score(y_test,y_predict)
print("Roc Auc Score:",var_roc)
var_accuracy=accuracy_score(y_test,y_predict)
print("Accuracy Score:",var_accuracy)
print()
print(classification_report(y_test, y_predict))
# --- Hyperparameter tuning for Logistic Regression via random search ---
from sklearn.model_selection import RandomizedSearchCV
# NOTE(review): not every penalty/solver pair is valid in scikit-learn
# (e.g. 'l1' is not supported by 'newton-cg', 'lbfgs' or 'sag'); invalid
# sampled combinations will fail to fit — confirm the installed sklearn
# version tolerates this, or restrict the grid to compatible pairs.
penalty = ['l1', 'l2']
C = [0.01,0.1,0.25,0.5,0.75,1]
solver = ['newton-cg','lbfgs','liblinear','sag','saga']
hyperparameters = dict(C=C, penalty=penalty, solver = solver)
logistic = LogisticRegression()
final_model = RandomizedSearchCV(logistic, hyperparameters, verbose=0)  # default n_iter=10 sampled combinations
final_model.fit(x_train,y_train)
y_predict = final_model.predict(x_test)  # predicts with the best estimator found
# Check different metrics for the tuned model
print('Confusion Matrix')
print(confusion_matrix(y_test,y_predict))
print()
var_recall=recall_score(y_test,y_predict)
print("Recall:",var_recall)
var_precision=precision_score(y_test,y_predict)
print("Precision:",var_precision)
var_f1=f1_score(y_test,y_predict)
print("F1 Score:",var_f1)
var_roc=roc_auc_score(y_test,y_predict)
print("Roc Auc Score:",var_roc)
var_accuracy=accuracy_score(y_test,y_predict)
print("Accuracy Score:",var_accuracy)
print()
print(classification_report(y_test, y_predict))
#Store the accuracy results for each model in a dataframe for final comparison
resultsDf = pd.DataFrame({'Method':['Logistic Regression'], 'Accuracy': var_accuracy, 'Recall': var_recall,'Precision': var_precision,'F1 Score': var_f1,'ROC AUC Score': var_roc})
resultsDf = resultsDf[['Method', 'Accuracy','Recall','Precision','F1 Score','ROC AUC Score']]
resultsDf
# --- Model 2: Decision Tree (unpruned, entropy criterion) ---
dt = DecisionTreeClassifier(criterion = 'entropy',random_state=1 )
dt.fit(x_train, y_train)
print("Train: %.2f" % dt.score(x_train, y_train)) # performance on train data
print("Test: %.2f" % dt.score(x_test, y_test)) # performance on test data — gap vs train indicates overfitting
# Render the fitted tree to a PNG via graphviz
dot_data = StringIO()
export_graphviz(dt, out_file=dot_data,
filled=True, rounded=True,
special_characters=True,feature_names = features,class_names=['0','1'])
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png('TD_Sales.png')
Image(graph.create_png())
y_predict = dt.predict(x_test)
# Confusion matrix
pd.crosstab(y_test, y_predict, rownames=['Actual'], colnames=['Predicted'])
# Check different metrics on the test set
print('Confusion Matrix')
print(confusion_matrix(y_test,y_predict))
var_recall=recall_score(y_test,y_predict)
print("Recall:",var_recall)
var_precision=precision_score(y_test,y_predict)
print("Precision:",var_precision)
var_f1=f1_score(y_test,y_predict)
print("F1 Score:",var_f1)
var_roc=roc_auc_score(y_test,y_predict)
print("Roc Auc Score:",var_roc)
var_accuracy=accuracy_score(y_test,y_predict)
print("Accuracy Score:",var_accuracy)
print()
print(classification_report(y_test, y_predict))
# Append this model's metrics to the comparison dataframe
tempResultsDf = pd.DataFrame({'Method':['Decision Tree'], 'Accuracy': var_accuracy, 'Recall': var_recall,'Precision': var_precision,'F1 Score': var_f1,'ROC AUC Score': var_roc})
resultsDf = pd.concat([resultsDf, tempResultsDf])
resultsDf = resultsDf[['Method', 'Accuracy','Recall','Precision','F1 Score','ROC AUC Score']]
resultsDf
# --- Model 3: Pruned Decision Tree (max_depth=6) to curb overfitting ---
# random_state added for reproducibility, consistent with the unpruned tree.
dt_pruned = DecisionTreeClassifier(criterion = "entropy", max_depth=6, random_state=1)
dt_pruned.fit(x_train, y_train)
print("Train: %.2f" % dt_pruned.score(x_train, y_train)) # performance on train data
print("Test: %.2f" % dt_pruned.score(x_test, y_test)) # performance on test data
# Render the pruned tree to a PNG via graphviz
dot_data = StringIO()
export_graphviz(dt_pruned, out_file=dot_data,
filled=True, rounded=True,
special_characters=True,feature_names = features,class_names=['0','1'])
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png('TD_Sales_Pruned.png')
Image(graph.create_png())
y_predict = dt_pruned.predict(x_test)
# Confusion matrix
pd.crosstab(y_test, y_predict, rownames=['Actual'], colnames=['Predicted'])
# Check different metrics on the test set
print('Confusion Matrix')
print(confusion_matrix(y_test,y_predict))
var_recall=recall_score(y_test,y_predict)
print("Recall:",var_recall)
var_precision=precision_score(y_test,y_predict)
print("Precision:",var_precision)
var_f1=f1_score(y_test,y_predict)
print("F1 Score:",var_f1)
var_roc=roc_auc_score(y_test,y_predict)
print("Roc Auc Score:",var_roc)
var_accuracy=accuracy_score(y_test,y_predict)
print("Accuracy Score:",var_accuracy)
print()
print(classification_report(y_test, y_predict))
# Feature importances of the pruned tree, sorted descending.
# (The original also computed dt_pruned.tree_.compute_feature_importances(...)
# into an unused local; that dead statement was removed.)
feat_imp_dict = dict(zip(features, dt_pruned.feature_importances_))
feat_imp = pd.DataFrame.from_dict(feat_imp_dict, orient='index')
feat_imp.sort_values(by=0, ascending=False)
From above table we can infer that contact and month impact our target variable most.
# Append the pruned tree's metrics to the comparison dataframe
tempResultsDf = pd.DataFrame({'Method':['Pruned Decision Tree'], 'Accuracy': var_accuracy, 'Recall': var_recall,'Precision': var_precision,'F1 Score': var_f1,'ROC AUC Score': var_roc})
resultsDf = pd.concat([resultsDf, tempResultsDf])
resultsDf = resultsDf[['Method', 'Accuracy','Recall','Precision','F1 Score','ROC AUC Score']]
resultsDf
# --- Model 4: Random Forest (50 trees, default depth) ---
rfcl = RandomForestClassifier(n_estimators = 50)
rfcl = rfcl.fit(x_train, y_train)
print("Train: %.2f" % rfcl.score(x_train, y_train)) # performance on train data
print("Test: %.2f" % rfcl.score(x_test, y_test)) # performance on test data
y_predict = rfcl.predict(x_test)
# Confusion matrix
pd.crosstab(y_test, y_predict, rownames=['Actual'], colnames=['Predicted'])
# Check different metrics on the test set
print('Confusion Matrix')
print(confusion_matrix(y_test,y_predict))
var_recall=recall_score(y_test,y_predict)
print("Recall:",var_recall)
var_precision=precision_score(y_test,y_predict)
print("Precision:",var_precision)
var_f1=f1_score(y_test,y_predict)
print("F1 Score:",var_f1)
var_roc=roc_auc_score(y_test,y_predict)
print("Roc Auc Score:",var_roc)
var_accuracy=accuracy_score(y_test,y_predict)
print("Accuracy Score:",var_accuracy)
print()
print(classification_report(y_test, y_predict))
# --- Hyperparameter tuning for Random Forest via random search ---
# Number of trees in the forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
# NOTE(review): 'auto' for max_features is deprecated/removed in newer
# scikit-learn versions — verify against the installed version.
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]# Create the random grid
random_grid = {'n_estimators': n_estimators,
'max_features': max_features,
'max_depth': max_depth,
'min_samples_split': min_samples_split,
'min_samples_leaf': min_samples_leaf,
'bootstrap': bootstrap}
print(random_grid)
rf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation,
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)# Fit the random search model
rf_random.fit(x_train, y_train)
rf_random.best_params_
y_predict = rf_random.predict(x_test)  # predicts with the best estimator found
# Confusion matrix
pd.crosstab(y_test, y_predict, rownames=['Actual'], colnames=['Predicted'])
# Check different metrics on the test set
print('Confusion Matrix')
print(confusion_matrix(y_test,y_predict))
var_recall=recall_score(y_test,y_predict)
print("Recall:",var_recall)
var_precision=precision_score(y_test,y_predict)
print("Precision:",var_precision)
var_f1=f1_score(y_test,y_predict)
print("F1 Score:",var_f1)
var_roc=roc_auc_score(y_test,y_predict)
print("Roc Auc Score:",var_roc)
var_accuracy=accuracy_score(y_test,y_predict)
print("Accuracy Score:",var_accuracy)
print()
print(classification_report(y_test, y_predict))
# Append the tuned random forest's metrics to the comparison dataframe
tempResultsDf = pd.DataFrame({'Method':['Random Forest'], 'Accuracy': var_accuracy, 'Recall': var_recall,'Precision': var_precision,'F1 Score': var_f1,'ROC AUC Score': var_roc})
resultsDf = pd.concat([resultsDf, tempResultsDf])
resultsDf = resultsDf[['Method', 'Accuracy','Recall','Precision','F1 Score','ROC AUC Score']]
resultsDf
# --- Model 5: Bagging (50 base trees on 50% bootstrap samples, with OOB scoring) ---
bgcl = BaggingClassifier(n_estimators=50, max_samples= .5, bootstrap=True, oob_score=True, random_state=22)
bgcl = bgcl.fit(x_train, y_train)
print("Train: %.2f" % bgcl.score(x_train, y_train)) # performance on train data
print("Test: %.2f" % bgcl.score(x_test, y_test)) # performance on test data
y_predict = bgcl.predict(x_test)
# Confusion matrix
pd.crosstab(y_test, y_predict, rownames=['Actual'], colnames=['Predicted'])
# Check different metrics on the test set
print('Confusion Matrix')
print(confusion_matrix(y_test,y_predict))
var_recall=recall_score(y_test,y_predict)
print("Recall:",var_recall)
var_precision=precision_score(y_test,y_predict)
print("Precision:",var_precision)
var_f1=f1_score(y_test,y_predict)
print("F1 Score:",var_f1)
var_roc=roc_auc_score(y_test,y_predict)
print("Roc Auc Score:",var_roc)
var_accuracy=accuracy_score(y_test,y_predict)
print("Accuracy Score:",var_accuracy)
print()
print(classification_report(y_test, y_predict))
# --- Hyperparameter tuning for Bagging via random search ---
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Method of selecting samples for training each tree
bootstrap = [True, False]# Create the random grid
# NOTE(review): the grid has only 10 x 2 = 20 combinations, fewer than
# n_iter=100 — depending on the sklearn version this warns or errors.
random_grid = {'n_estimators': n_estimators,
'bootstrap': bootstrap}
print(random_grid)
bgcl = BaggingClassifier()
# Random search of parameters, using 3 fold cross validation,
# search across 100 different combinations, and use all available cores
bgcl_random = RandomizedSearchCV(estimator = bgcl, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)# Fit the random search model
bgcl_random.fit(x_train, y_train)
bgcl_random.best_params_
y_predict = bgcl_random.predict(x_test)  # predicts with the best estimator found
# Check different metrics on the test set
print('Confusion Matrix')
print(confusion_matrix(y_test,y_predict))
var_recall=recall_score(y_test,y_predict)
print("Recall:",var_recall)
var_precision=precision_score(y_test,y_predict)
print("Precision:",var_precision)
var_f1=f1_score(y_test,y_predict)
print("F1 Score:",var_f1)
var_roc=roc_auc_score(y_test,y_predict)
print("Roc Auc Score:",var_roc)
var_accuracy=accuracy_score(y_test,y_predict)
print("Accuracy Score:",var_accuracy)
print()
print(classification_report(y_test, y_predict))
# Append the tuned bagging model's metrics to the comparison dataframe
tempResultsDf = pd.DataFrame({'Method':['Bagging'], 'Accuracy': var_accuracy, 'Recall': var_recall,'Precision': var_precision,'F1 Score': var_f1,'ROC AUC Score': var_roc})
resultsDf = pd.concat([resultsDf, tempResultsDf])
resultsDf = resultsDf[['Method', 'Accuracy','Recall','Precision','F1 Score','ROC AUC Score']]
resultsDf
# --- Model 6: AdaBoost (100 estimators, learning rate 0.1) ---
abcl = AdaBoostClassifier(n_estimators = 100, learning_rate=0.1, random_state=22)
abcl = abcl.fit(x_train, y_train)
print("Train: %.2f" % abcl.score(x_train, y_train)) # performance on train data
print("Test: %.2f" % abcl.score(x_test, y_test)) # performance on test data
y_predict = abcl.predict(x_test)
# Confusion matrix
pd.crosstab(y_test, y_predict, rownames=['Actual'], colnames=['Predicted'])
# Check different metrics on the test set
print('Confusion Matrix')
print(confusion_matrix(y_test,y_predict))
var_recall=recall_score(y_test,y_predict)
print("Recall:",var_recall)
var_precision=precision_score(y_test,y_predict)
print("Precision:",var_precision)
var_f1=f1_score(y_test,y_predict)
print("F1 Score:",var_f1)
var_roc=roc_auc_score(y_test,y_predict)
print("Roc Auc Score:",var_roc)
var_accuracy=accuracy_score(y_test,y_predict)
print("Accuracy Score:",var_accuracy)
print()
print(classification_report(y_test, y_predict))
# --- Hyperparameter tuning for AdaBoost via random search ---
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
random_grid = {'n_estimators': n_estimators}
print(random_grid)
# BUG FIX: the original instantiated BaggingClassifier() here, so the
# "Adaboost" row in the results table actually came from a bagging model;
# this section must tune AdaBoostClassifier.
abcl = AdaBoostClassifier()
# Random search of parameters, using 3 fold cross validation,
# and use all available cores.
# NOTE(review): the grid has only 10 combinations, fewer than n_iter=100 —
# depending on the sklearn version this warns or errors.
abcl_random = RandomizedSearchCV(estimator = abcl, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)# Fit the random search model
abcl_random.fit(x_train, y_train)
y_predict = abcl_random.predict(x_test)  # predicts with the best estimator found
# Check different metrics on the test set
print('Confusion Matrix')
print(confusion_matrix(y_test,y_predict))
var_recall=recall_score(y_test,y_predict)
print("Recall:",var_recall)
var_precision=precision_score(y_test,y_predict)
print("Precision:",var_precision)
var_f1=f1_score(y_test,y_predict)
print("F1 Score:",var_f1)
var_roc=roc_auc_score(y_test,y_predict)
print("Roc Auc Score:",var_roc)
var_accuracy=accuracy_score(y_test,y_predict)
print("Accuracy Score:",var_accuracy)
print()
print(classification_report(y_test, y_predict))
# Append the tuned AdaBoost model's metrics to the comparison dataframe
tempResultsDf = pd.DataFrame({'Method':['Adaboost'], 'Accuracy': var_accuracy, 'Recall': var_recall,'Precision': var_precision,'F1 Score': var_f1,'ROC AUC Score': var_roc})
resultsDf = pd.concat([resultsDf, tempResultsDf])
resultsDf = resultsDf[['Method', 'Accuracy','Recall','Precision','F1 Score','ROC AUC Score']]
resultsDf
# --- Model 7: Gradient Boosting (50 estimators, learning rate 0.1) ---
gbcl = GradientBoostingClassifier(n_estimators = 50, learning_rate = 0.1, random_state=22)
gbcl = gbcl.fit(x_train, y_train)
print("Train: %.2f" % gbcl.score(x_train, y_train)) # performance on train data
print("Test: %.2f" % gbcl.score(x_test, y_test)) # performance on test data
y_predict = gbcl.predict(x_test)
# Confusion matrix
pd.crosstab(y_test, y_predict, rownames=['Actual'], colnames=['Predicted'])
# Check different metrics on the test set
print('Confusion Matrix')
print(confusion_matrix(y_test,y_predict))
var_recall=recall_score(y_test,y_predict)
print("Recall:",var_recall)
var_precision=precision_score(y_test,y_predict)
print("Precision:",var_precision)
var_f1=f1_score(y_test,y_predict)
print("F1 Score:",var_f1)
var_roc=roc_auc_score(y_test,y_predict)
print("Roc Auc Score:",var_roc)
var_accuracy=accuracy_score(y_test,y_predict)
print("Accuracy Score:",var_accuracy)
print()
print(classification_report(y_test, y_predict))
# Append the gradient boosting model's metrics for the final comparison
tempResultsDf = pd.DataFrame({'Method':['Gradient boost'], 'Accuracy': var_accuracy, 'Recall': var_recall,'Precision': var_precision,'F1 Score': var_f1,'ROC AUC Score': var_roc})
resultsDf = pd.concat([resultsDf, tempResultsDf])
resultsDf = resultsDf[['Method', 'Accuracy','Recall','Precision','F1 Score','ROC AUC Score']]
resultsDf
The model gives roughly 98% correct predictions for customers who will subscribe to a term deposit. Note, however, that oversampling was performed before the train/test split, so duplicated positive rows appear in both sets; the real-world hit ratio on the original class balance is likely lower than this figure.